# Pull the per-run inputs out of the report parameters.
countmatrix.all <- params$countmatrix.all
metadata.all <- params$metadata.all
cont <- params$controlCellLine
exp <- params$experimentalCellLine
# Drop `params` so that subsequent parameterized calls can be made.
rm(params)

# Keep only the samples belonging to the control or experimental cell line.
metadata.pair <- filter(
  as.data.frame(metadata.all),
  CellLine %in% c(cont, exp)
)
as.data.frame(metadata.pair)

# Restrict the count matrix to the columns named by the selected samples.
countmatrix.pair <- countmatrix.all[, metadata.pair$ShortName]
as.data.frame(countmatrix.pair)
Run DESeq2 on the data set.
# The fitted object for this pair was saved by an earlier run; loading it
# instead of refitting saves time (recent changes are all after this point).
# The file is expected to provide `dds.pair`, which is used just below.
load(file = str_interp("Rdata/${exp}_vs_${cont}_dds.RData"))

# Experimental-vs-control contrast, ordered by ascending log2 fold change,
# then exported in full.
res <- results(dds.pair, alpha = 0.05, contrast = c("CellLine", exp, cont))
res <- res[order(res$log2FoldChange, decreasing = FALSE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_deseq_results.csv")
write.csv(x = as.data.frame(res), file = outFile)
Filter `res` to keep only the genes with an adjusted p-value (padj) below 0.05.
# Keep only the genes whose adjusted p-value is below 0.05.
res.filtered <- filter(as.data.frame(res), padj < 0.05)
# A fold-change cut-off is currently disabled:
# filter(log2FoldChange >= 1.5 | log2FoldChange <= -1.5)

# Largest log2 fold change first, then display the table.
res.filtered <- res.filtered[
  order(res.filtered[["log2FoldChange"]], decreasing = TRUE),
]
res.filtered
# Every gene with a positive log2 fold change, regardless of significance,
# ranked from the largest increase downwards.
up.unfiltered <- subset(res, log2FoldChange > 0)
up.unfiltered <- up.unfiltered[
  order(up.unfiltered$log2FoldChange, decreasing = TRUE),
]

# Export and display only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_upregulated_genes.csv")
write.csv(x = up.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
up.unfiltered[, c("log2FoldChange", "padj")]
log2 fold change (MLE): CellLine PEA2 vs PEA1
DataFrame with 9935 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
SULT1A3 23.0809 2.51448e-08
EHF 16.7101 3.10531e-42
CCDC129 14.9666 6.21388e-34
AGR2 14.4740 1.10314e-31
ESRP1 14.2988 5.77523e-31
... ... ...
FLYWCH1 8.04855e-04 0.998286
RN7SK 5.72356e-04 0.999080
ISG20L2 5.70131e-04 0.998178
CYB5B 5.51788e-04 0.998286
MYO5A 7.72654e-05 0.999854
# Upregulated genes taken from the padj-filtered table, strongest increase
# first.
up <- subset(res.filtered, log2FoldChange > 0)
up <- up[order(up[["log2FoldChange"]], decreasing = TRUE), ]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_genes.csv")
write.csv(x = up[, c("log2FoldChange", "padj")], file = outFile)
print(up[, c("log2FoldChange", "padj")])
# Every gene with a negative log2 fold change, regardless of significance,
# ranked from the strongest decrease (most negative value) upwards.
down.unfiltered <- subset(res, log2FoldChange < 0)
down.unfiltered <- down.unfiltered[
  order(down.unfiltered$log2FoldChange, decreasing = FALSE),
]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_all_downregulated_genes.csv")
write.csv(x = down.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
print(down.unfiltered[, c("log2FoldChange", "padj")])
log2 fold change (MLE): CellLine PEA2 vs PEA1
DataFrame with 11299 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
IPO11-LRRC70 -23.0428 2.66766e-08
LOC101929304 -22.2940 7.88754e-08
TM4SF18 -14.9989 3.51917e-34
FAT1 -14.6631 4.91444e-20
TRNAD17 -12.9771 2.87443e-03
... ... ...
COX8A -1.00084e-03 0.996937
NELFE -5.88644e-04 0.998178
WASH5P -5.15565e-04 0.998848
RPN2 -3.55934e-04 0.998848
SDHAF2 -1.90322e-05 0.999942
# Downregulated genes taken from the padj-filtered table.
down <- subset(res.filtered, log2FoldChange < 0)
# Rank from the strongest decrease (most negative log2 fold change) upwards,
# matching the ordering used for `down.unfiltered`. The previous
# `decreasing = TRUE` put the weakest effects at the top of the table and CSV.
down <- down[order(down$log2FoldChange, decreasing = FALSE), ]

# Export and print only the fold change and adjusted p-value columns.
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_genes.csv")
write.csv(down[, c("log2FoldChange", "padj")], file = outFile)
print(down[, c("log2FoldChange", "padj")])
# Volcano plot of all genes: log2 fold change on x, -log10(adjusted p-value)
# on y, gene names as (repelled) labels.
# Reference lines: y = 1.301 is approximately -log10(0.05); x = +/-1.2 marks
# the fold-change guides. The x axis is clipped to [-10, 10].
ggplot(
  data = as.data.frame(res),
  mapping = aes(x = log2FoldChange, y = -log10(padj), label = rownames(res))
) +
  geom_point() +
  geom_text_repel() +
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  scale_color_manual(values = c("black", "blue", "red")) +
  xlim(-10, 10) +
  theme_minimal()
Warning: Removed 11833 rows containing missing values (`geom_point()`).
Warning: Removed 11833 rows containing missing values (`geom_text_repel()`).
Warning: ggrepel: 17884 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Perform gene set enrichment analysis (GSEA) using clusterProfiler. This gives us the GO pathways that are significantly regulated, based on the log2 fold change in expression of the individual genes.
Using a p-value cutoff of 0.05.
# Build the ranked gene list: log2 fold changes named by gene, sorted from
# the largest value downwards.
gene_list <- sort(
  setNames(res$log2FoldChange, rownames(res)),
  decreasing = TRUE
)

# Set the seed so our results are reproducible:
set.seed(2023)
# GO Biological Process enrichment on gene symbols, p-value cutoff 0.05.
gsea_res <- gseGO(
  geneList = gene_list,
  ont = "BP",
  OrgDb = "org.Hs.eg.db",
  keyType = "SYMBOL",
  pvalueCutoff = 0.05,
  seed = TRUE
)
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (13.32% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize = minSize, : For some
pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for
better estimation.
leading edge analysis...
done...
# Format output: convert the enrichment result to a data frame and record each
# pathway's position in `gsea_res` before the rows are re-sorted.
gsea_res_df <- as.data.frame(gsea_res)
gsea_res_df$original_row_num <- seq_len(nrow(gsea_res_df))

# Highest NES first; key the rows by their ID column.
gsea_res_df <- gsea_res_df[order(gsea_res_df[["NES"]], decreasing = TRUE), ]
row.names(gsea_res_df) <- gsea_res_df[["ID"]]
NES is the normalized enrichment score.
# Compact view of the enrichment table: key statistics, the description and
# the core enrichment genes.
gsea_res_df_short <- gsea_res_df[, c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short[["core_enrichment_genes"]] <- gsea_res_df[["core_enrichment"]]

# Pathways with a non-negative NES are reported as upregulated.
gsea_res_df_short.up <- subset(gsea_res_df_short, NES >= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_pathways.csv")
write.csv(x = gsea_res_df_short.up, file = outFile)
gsea_res_df_short.up
GSEA plot of the five most upregulated pathways (or least downregulated)
# `gsea_res_df` is sorted by decreasing NES, so its first rows are the most
# upregulated pathways. Cap the count at the number of available pathways.
maxIndex <- min(5, nrow(gsea_res_df))
# seq_len() gives an empty index when there are no pathways; `1:maxIndex`
# would give c(1, 0) in that case.
top5PathwaysIds <- gsea_res_df[seq_len(maxIndex), "original_row_num"]
# `original_row_num` refers back to the pathway's row in `gsea_res`, which is
# what gseaplot2() indexes by.
if (length(top5PathwaysIds) > 0) {
  gseaplot2(gsea_res, geneSetID = top5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
} else {
  print("There is no GSEA plot because no pathways were significantly enriched")
}
Volcano Plot (Average NES & adjusted p value)
# Volcano plot of the upregulated pathways: NES on x, -log10(adjusted p-value)
# on y, labelled with the row names of the pathway table.
# Reference lines: y = 1.301 is approximately -log10(0.05); x = +/-1.2.
ggplot(
  data = as.data.frame(gsea_res_df_short.up),
  mapping = aes(
    x = NES,
    y = -log10(p.adjust),
    label = rownames(gsea_res_df_short.up)
  )
) +
  geom_point() +
  geom_text_repel() +
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  scale_color_manual(values = c("black", "blue", "red")) +
  xlim(-10, 10) +
  theme_minimal()
Warning: ggrepel: 80 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
# Pathways with a non-positive NES are reported as downregulated; export the
# table and display it.
gsea_res_df_short.down <- subset(gsea_res_df_short, NES <= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_pathways.csv")
write.csv(x = gsea_res_df_short.down, file = outFile)
gsea_res_df_short.down
GSEA plot of the five most downregulated pathways (or least upregulated)
# `gsea_res_df` is sorted by decreasing NES, so its last rows are the most
# downregulated pathways. The last five rows start at nrow - 4; the previous
# `nrow - 5` selected six pathways.
minIndex <- max(1, nrow(gsea_res_df) - 4)
# tail() returns at most five entries and an empty vector when there are no
# pathways; `minIndex:nrow(...)` would give c(1, 0) in that case.
bottom5PathwaysIds <- tail(gsea_res_df$original_row_num, 5)
# `original_row_num` refers back to the pathway's row in `gsea_res`, which is
# what gseaplot2() indexes by.
if (length(bottom5PathwaysIds) > 0) {
  gseaplot2(gsea_res, geneSetID = bottom5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
} else {
  print("There is no GSEA plot because no pathways were significantly enriched")
}
Volcano plot (Average NES & adjusted p value)
# Volcano plot of the downregulated pathways: NES on x, -log10(adjusted
# p-value) on y, labelled with the row names of the pathway table.
# Reference lines: y = 1.301 is approximately -log10(0.05); x = +/-1.2.
ggplot(
  data = as.data.frame(gsea_res_df_short.down),
  mapping = aes(
    x = NES,
    y = -log10(p.adjust),
    label = rownames(gsea_res_df_short.down)
  )
) +
  geom_point() +
  geom_text_repel() +
  geom_hline(yintercept = 1.301) +
  geom_vline(xintercept = 1.2) +
  geom_vline(xintercept = -1.2) +
  scale_color_manual(values = c("black", "blue", "red")) +
  xlim(-10, 10) +
  theme_minimal()
Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Use Revigo to cluster upregulated pathways
# Input for the term clustering: adjusted p-values of the upregulated
# pathways. Column selection keeps the row names of the pathway table, so no
# separate rownames assignment is needed.
revigo_input.cellline.up <- gsea_res_df_short.up["p.adjust"]

# Pairwise semantic similarity between the terms. It is only computed when
# there is more than one term, the same condition under which the similarity
# matrix is reduced further below. With fewer terms there is nothing to
# cluster, so `simMatrix` is left as NULL.
simMatrix <- if (nrow(revigo_input.cellline.up) > 1) {
  calculateSimMatrix(
    rownames(revigo_input.cellline.up),
    orgdb = "org.Hs.eg.db",
    ont = "BP",
    method = "Rel"
  )
} else {
  NULL
}
preparing gene to GO mapping data...
preparing IC data...
# Score each term by -log10(adjusted p-value), named by the term's row name.
scores <- -log10(revigo_input.cellline.up$p.adjust)
names(scores) <- rownames(revigo_input.cellline.up)

# Clustering needs at least two terms; otherwise fall back to an empty table
# so that the plotting chunks below are skipped.
if (nrow(revigo_input.cellline.up) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble indicates the representative (chosen mostly by p-value) of a cluster of terms. The size of the bubble indicates the generality of the term (larger meaning a more general term).
# Draw the scatter plot only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms are clustered together.
# Draw the static heatmap, annotated by parent term, only when at least three
# reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  heatmapPlot(
    simMatrix = simMatrix,
    reducedTerms = reducedTerms,
    annotateParent = TRUE,
    annotationLabel = "parentTerm",
    fontsize = 6
  )
}
This is the same content, but interactive.
# Draw the interactive heatmap only when at least three reduced terms are
# available.
if (nrow(reducedTerms) >= 3) {
  revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
# Draw the treemap only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  treemapPlot(reducedTerms = reducedTerms)
}
Use Revigo to cluster downregulated pathways
# Input for the term clustering: adjusted p-values of the downregulated
# pathways. Column selection keeps the row names of the pathway table, so no
# separate rownames assignment is needed.
revigo_input.cellline.down <- gsea_res_df_short.down["p.adjust"]

# Pairwise semantic similarity between the terms. It is only computed when
# there is more than one term, the same condition under which the similarity
# matrix is reduced further below. With fewer terms there is nothing to
# cluster, so `simMatrix` is left as NULL.
simMatrix <- if (nrow(revigo_input.cellline.down) > 1) {
  calculateSimMatrix(
    rownames(revigo_input.cellline.down),
    orgdb = "org.Hs.eg.db",
    ont = "BP",
    method = "Rel"
  )
} else {
  NULL
}
preparing gene to GO mapping data...
preparing IC data...
# Score each term by -log10(adjusted p-value), named by the term's row name.
scores <- -log10(revigo_input.cellline.down$p.adjust)
names(scores) <- rownames(revigo_input.cellline.down)

# Clustering needs at least two terms; otherwise fall back to an empty table
# so that the plotting chunks below are skipped.
if (nrow(revigo_input.cellline.down) <= 1) {
  reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
  print("There will be no graphs appearing below this because there were not enough significantly downregulated pathways to meaningfully cluster them")
} else {
  reducedTerms <- reduceSimMatrix(
    simMatrix,
    scores,
    threshold = 0.7,
    orgdb = "org.Hs.eg.db"
  )
}
Revigo interactive scatter plot. Distances represent the similarity between terms, and the axes are the first two components of a PCA. Each bubble indicates the representative (chosen mostly by p-value) of a cluster of terms. The size of the bubble indicates the generality of the term (larger meaning a more general term).
# Draw the scatter plot only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms are clustered together.
# Draw the static heatmap, annotated by parent term, only when at least three
# reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  heatmapPlot(
    simMatrix = simMatrix,
    reducedTerms = reducedTerms,
    annotateParent = TRUE,
    annotationLabel = "parentTerm",
    fontsize = 6
  )
}
This is the same content, but interactive.
# Draw the interactive heatmap only when at least three reduced terms are
# available.
if (nrow(reducedTerms) >= 3) {
  revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
# Draw the treemap only when at least three reduced terms are available.
if (nrow(reducedTerms) >= 3) {
  treemapPlot(reducedTerms = reducedTerms)
}